import pandas as pd
from scipy.stats import pearsonr

# 1. Read in the data
dev_df = pd.read_excel("Media_6Metrics_DeviationSummary.xlsx", index_col="Media", engine="openpyxl")
ment_df = pd.read_excel("Media_Mention_Summary.xlsx",      index_col="Media", engine="openpyxl")

# 2. Merge on Media
df = dev_df.join(ment_df, how="inner")

# 3. Prepare containers
mention_vars   = ["AvgMonthlyMentions", "EventMean"]
deviation_vars = dev_df.columns.tolist()

# 4. Compute Pearson r and p-value for each pair
results = []
for dev in deviation_vars:
    for ment in mention_vars:
        # Drop any NaNs
        sub = df[[dev, ment]].dropna()
        if len(sub) < 2:
            r, p = (float("nan"), float("nan"))
        else:
            r, p = pearsonr(sub[dev], sub[ment])
        results.append({
            "DeviationMetric": dev,
            "MentionMetric": ment,
            "Pearson_r":         r,
            "p_value":           p,
            "NegativeCorr?":     r < 0  # True if lower deviation ↔ higher mentions
        })

# 5. Build results DataFrame
res_df = pd.DataFrame(results)

# 6. Save to Excel
res_df.to_excel("Correlation_Deviation_vs_Mentions.xlsx", index=False)

print("Correlation analysis complete. Results saved to 'Correlation_Deviation_vs_Mentions.xlsx'.")
